require(quanteda)
require(stm)
require(mvtnorm)

#### covert the data to stm format ####
load("dfm_matrix.Rdata")

stm.data <- convert(dfm.matrix, to = "stm")
stm.data$meta$year <- factor(stm.data$meta$year)
stm.data$meta$party <- factor(stm.data$meta$party,
                              levels = c("others", "LDP", "JSP", "Komeito", "DSP", "JCP", "NFP", "DPJ", "left", "right"))
stm.data$meta$party.year <- paste(stm.data$meta$party, 
                                  stm.data$meta$year, sep = ".")

#### STM with Model (A.3) in Online Appendix H ####
set.seed(12345)
STM.result.competing <- stm(documents = stm.data$documents, 
                            vocab = stm.data$vocab, K = 75, 
                            prevalence = ~ female * competing + year + party + age + I(age ^ 2) + incumbent + wins, 
                            data = stm.data$meta)
save(STM.result.competing, file = "STM_result_competing.Rdata")

#### post-estimation simulation ####
## conduct the post-estimation regression
set.seed(12345)
post.simcompeting <- estimateEffect(formula(1:75 ~ female * competing + year + party + age + I(age ^ 2) + incumbent + wins), 
                                    STM.result.competing, stm.data$meta, nsims = 100)

## function implementing the procedure explained in Online Appendix A.3
predict.fun.competing <- function(result, sim.beta, alpha) {  # alpha indicates the significance level
  cdata <- result$data
  cdata[, "competing"] <- 1
  cdata[, "female"] <- 1
  cdata[, "female:competing"] <- 1
  design.matrix <- makeDesignMatrix(result$formula, result$data, cdata, sparse = FALSE)
  sim.result.1 <- tcrossprod(design.matrix, sim.beta)
  cdata[, "female"] <- 0
  cdata[, "female:competing"] <- 0
  design.matrix <- makeDesignMatrix(result$formula, result$data, cdata, sparse = FALSE)
  sim.result.2 <- tcrossprod(design.matrix, sim.beta)
  dif.sim.result.1 <- sim.result.1 - sim.result.2
  dif.sim.mean.1 <- mean(dif.sim.result.1)
  dif.sim.CI.1 <- quantile(colMeans(dif.sim.result.1), c(alpha / 2, 1 - alpha / 2))
  cdata[, "competing"] <- 0
  cdata[, "female"] <- 1
  design.matrix <- makeDesignMatrix(result$formula, result$data, cdata, sparse = FALSE)
  sim.result.1 <- tcrossprod(design.matrix, sim.beta)
  cdata[, "female"] <- 0
  design.matrix <- makeDesignMatrix(result$formula, result$data, cdata, sparse = FALSE)
  sim.result.2 <- tcrossprod(design.matrix, sim.beta)
  dif.sim.result.2 <- sim.result.1 - sim.result.2
  dif.sim.mean.2 <- mean(dif.sim.result.2)
  dif.sim.CI.2 <- quantile(colMeans(dif.sim.result.2), c(alpha / 2, 1 - alpha / 2))
  second.dif.sim.result <- dif.sim.result.1 - dif.sim.result.2
  second.dif.sim.mean <- mean(second.dif.sim.result)
  second.dif.sim.CI <- quantile(colMeans(second.dif.sim.result), c(alpha / 2, 1 - alpha / 2))
  c(dif.sim.mean.1, dif.sim.CI.1, dif.sim.mean.2, dif.sim.CI.2, 
    second.dif.sim.mean, second.dif.sim.CI)
}

## compute the FD between women and men conditioned by the existence of other women candidates
distinctive.topic.id <- c(42, 47, 58, 69, 46)  # see 1_main_analysis_model1.R
distinctive.topic.labels <- c("Policy support for mothers", "Gender equality",             
                              "Child care", "Pacifism", 
                              "Agricultural administration")

competing.sim.result <- matrix(NA, 5, 9)
rownames(competing.sim.result) <- distinctive.topic.labels
for (i in 1:5) {
  set.seed(12345)
  sim.beta <- do.call(rbind, 
                      lapply(post.simcompeting$parameters[[distinctive.topic.id[i]]], 
                             function(x) rmvnorm(100, x$est, x$vcov)))
  competing.sim.result[i, ] <- predict.fun.competing(post.simcompeting, sim.beta, 0.05)
}
round(competing.sim.result, 3)

# Figure A.14
cairo_pdf("Figure_A14.pdf", 
          width = 6.2, height = 3, pointsize = 8, family = "Helvetica")
layout(matrix(1:5, 1, 5))
par(mar = c(3, 2, 3, 2), lwd = 0.5)
for (i in 1:5) {
  plot(NULL, NULL, type = "n", bty = "L", 
       xlim = c(0.5, 3.5), ylim = c(-0.02, 0.04), 
       main = distinctive.topic.labels[i], 
       xlab = "", ylab = "", xaxt = "n", yaxt = "n")
  abline(h = 0, lty = 3, col = "gray50")
  for (j in 1:3) {
    points(j, competing.sim.result[i, 3 * j - 2], pch = 19)
    segments(j, competing.sim.result[i, 3 * j - 1], j, competing.sim.result[i, 3 * j])
  }
  axis(1, at = 1:3, labels = FALSE, lwd = 0.5)
  axis(2, lwd = 0.5)
  mtext(c("Yes", "No"), side = 1, line = 1, at = 1:2, cex = 0.8)
  mtext("Female > 1", side = 1, line = 2, at = 1.5, cex = 0.8)
  mtext("Second\ndif.", side = 1, line = 2, at = 3, cex = 0.8)
}
dev.off()

## check if the second difference for "policy support for mothers" is significant at the 10% level
set.seed(12345)
sim.beta <- do.call(rbind, 
                    lapply(post.simcompeting$parameters[[distinctive.topic.id[1]]], 
                           function(x) rmvnorm(100, x$est, x$vcov)))
competing.sim.result.10pct <- predict.fun.competing(post.simcompeting, sim.beta, 0.1)
competing.sim.result.10pct[8] > 0  # lower limit of the 90% confidence interval of the second difference